'''
    This program will read workers resumes and determine whether they ever held
    a job title that related to being a founder or in a c-suite position.
'''

print("Starting R_create_resumes_work_exp.")

import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import csv
import statsmodels.api as sm
from datetime import datetime
import string
import gc

gdSeed = "G:/Shared drives/Economic Research/Research/zzz - Jason Sockin Projects/DataGrab"
fileSeed = "C:/Users/jsock/Dropbox/Research/GD/International"

savePath = fileSeed + "/Data/"
dataPath = gdSeed + "/Resumes/"

##############################################################################
# Read in education information 
##############################################################################

# Read in domestic and international schools on resumes
resumeEduc = pd.read_csv(savePath + "Cleaned_user_schools_majors.csv")
del resumeEduc['Unnamed: 0']

resumeEduc = resumeEduc[resumeEduc.school.notnull()]

del resumeEduc['FK_resumeId']

##############################################################################
# Keep work experience data for resumes with age available
##############################################################################

allResumes = pd.DataFrame([])

for ii in range(1955,2000,5):

    currYr = ii
    endYr  = currYr + 5
    
    print("Current year is: " + str(currYr))
    
    if ii > 1980:
        resumeWorkExp  = pd.read_csv(dataPath + 'Resumes_workexp_birthYear_' + str(currYr) + '-' + str(endYr) + '_03_30_2019.csv')
    else:
        resumeWorkExp  = pd.read_csv(dataPath + 'Resumes_workexp_birthYear_' + str(currYr) + '-' + str(endYr) + '_01_03_2022.csv')
    
    del resumeWorkExp['Unnamed: 0']
    
    resumeWorkExp = resumeWorkExp.sort_values(['FK_resumeId','startDate'])
    
    resumeWorkExp['jobOrder'] = resumeWorkExp.groupby('FK_resumeId').cumcount()+1
    
    resumeJobs = pd.DataFrame(resumeWorkExp.groupby('FK_resumeId').size().reset_index())
    resumeJobs.rename(columns={0:'resumeJobs'},inplace=True)
    
    resumeWorkExp = pd.merge(resumeWorkExp, resumeJobs, how='left', left_on = ['FK_resumeId'], right_on = ['FK_resumeId'])
            
    del resumeJobs
    
    ##############################################################################
    # Drop resumes that have a jobstart date that is in the future
    ##############################################################################
    
    # Drop test entries
    resumeWorkExp = resumeWorkExp[resumeWorkExp.jobTitle!="*** INVALID JOB TITLE ***"]
    
    resumeWorkExp['startDate'] = pd.to_datetime(resumeWorkExp.startDate,errors='coerce')
    
    resumeWorkExp['dateGrabData'] = pd.to_datetime(datetime.strptime('2019-08-27', '%Y-%m-%d'))
    
    resumeWorkExp['dropResumeStartDate'] = (resumeWorkExp.startDate >= resumeWorkExp.dateGrabData) * 1
    
    dropStartDate = pd.DataFrame(resumeWorkExp.groupby('FK_resumeId').dropResumeStartDate.max()).reset_index()
    dropStartDate = dropStartDate[dropStartDate.dropResumeStartDate==1]
    dropStartDate.rename(columns={'dropResumeStartDate':'dropdropStartDate'},inplace=True)
    
    resumeWorkExp = pd.merge(resumeWorkExp, dropStartDate, how='left', left_on = ['FK_resumeId'], right_on = ['FK_resumeId'])
    
    resumeWorkExp = resumeWorkExp[resumeWorkExp.dropdropStartDate!=1]
    
    del resumeWorkExp['dateGrabData']
    del resumeWorkExp['dropResumeStartDate']
    del resumeWorkExp['dropdropStartDate']
    del dropStartDate
    
    print("Finished start date error.")
        
    ##############################################################################
    # Drop resumes that have negative tenure for a job
    ##############################################################################
    
    resumeWorkExp['endDate'] = pd.to_datetime(resumeWorkExp.endDate,errors='coerce')
    
    resumeWorkExp['yrsTenure'] = (resumeWorkExp.endDate-resumeWorkExp.startDate).apply(lambda x: x.days / 365)
    
    resumeWorkExp['negTenure'] = (resumeWorkExp.yrsTenure < 0) * 1
    
    dropNegTenure = pd.DataFrame(resumeWorkExp.groupby('FK_resumeId').negTenure.max())
    dropNegTenure.rename(columns={'negTenure':'dropNegTenure'},inplace=True)
    dropNegTenure = dropNegTenure[dropNegTenure.dropNegTenure==1]
    
    resumeWorkExp = pd.merge(resumeWorkExp, dropNegTenure, how='left', left_on = ['FK_resumeId'], right_on = ['FK_resumeId'])
    
    resumeWorkExp = resumeWorkExp[resumeWorkExp.dropNegTenure!=1]
    
    del resumeWorkExp['negTenure']
    del resumeWorkExp['dropNegTenure']
    del dropNegTenure
    
    print("Finished negative tenure error.")
    
    ##############################################################################
    # Drop resumes that have a job with impossible tenure period
    ##############################################################################
    
    resumeWorkExp['tenureTooLong'] = (resumeWorkExp.yrsTenure > 50) * 1
    
    dropTenure = pd.DataFrame(resumeWorkExp.groupby('FK_resumeId').tenureTooLong.max())
    dropTenure.rename(columns={'tenureTooLong':'dropTenure'},inplace=True)
    dropTenure = dropTenure[dropTenure.dropTenure==1]
    
    resumeWorkExp = pd.merge(resumeWorkExp, dropTenure, how='left', left_on = ['FK_resumeId'], right_on = ['FK_resumeId'])
    
    resumeWorkExp = resumeWorkExp[resumeWorkExp.dropTenure!=1]
    
    del resumeWorkExp['tenureTooLong']
    del resumeWorkExp['dropTenure']
    del dropTenure
        
    print("Finished tenure too long.")
    
    ##############################################################################
    # Keep only resumes that have job titles for all entries
    ##############################################################################
    
    resumeWorkExp['missingJobTitle'] = resumeWorkExp.jobTitle.isnull()
    
    missingJobTitle = pd.DataFrame(resumeWorkExp.groupby('FK_resumeId').missingJobTitle.sum()).reset_index()
    
    del resumeWorkExp['missingJobTitle']
    
    missingJobTitle = missingJobTitle[missingJobTitle.missingJobTitle>0]
    
    resumeWorkExp = pd.merge(resumeWorkExp, missingJobTitle, how='left', left_on = ['FK_resumeId'], right_on = ['FK_resumeId'])
    
    resumeWorkExp = resumeWorkExp[resumeWorkExp.missingJobTitle.isnull()]
    
    del resumeWorkExp['missingJobTitle']
    del missingJobTitle
        
    print("Finished missing job title.")

    ##############################################################################
    # Keep only the most recent resume for each worker
    ##############################################################################
    
    lastResume = pd.DataFrame(resumeWorkExp.groupby('FK_userId').FK_resumeId.max()).reset_index()
    
    lastResume['lastResume'] = 1
    
    resumeWorkExp = pd.merge(resumeWorkExp, lastResume, how='left', left_on = ['FK_userId','FK_resumeId'], right_on = ['FK_userId','FK_resumeId'])
    
    resumeWorkExp = resumeWorkExp[resumeWorkExp.lastResume==1]
    
    del resumeWorkExp['lastResume']
        
    ##############################################################################
    # Add education information from resumes
    ##############################################################################
    
    resumeWorkExp = pd.merge(resumeWorkExp, resumeEduc, how='left', left_on = ['FK_userId'], right_on = ['FK_userId'])

    #resumeWorkExp = resumeWorkExp[resumeWorkExp.school.notnull()]

    ##############################################################################
    # Append resumes
    ##############################################################################

    allResumes = allResumes.append(resumeWorkExp)
    
    del resumeWorkExp
    
##############################################################################
# Keep work experience data for resumes with age unavailable
##############################################################################

resumeWorkExp  = pd.read_csv(dataPath + 'Resumes_workexp_noBirthYear_08_27_2019.csv')
del resumeWorkExp['Unnamed: 0']

resumeWorkExp = resumeWorkExp.sort_values(['FK_resumeId','startDate'])

resumeWorkExp['jobOrder'] = resumeWorkExp.groupby('FK_resumeId').cumcount()+1

resumeJobs = pd.DataFrame(resumeWorkExp.groupby('FK_resumeId').size().reset_index())
resumeJobs.rename(columns={0:'resumeJobs'},inplace=True)

resumeWorkExp = pd.merge(resumeWorkExp, resumeJobs, how='left', left_on = ['FK_resumeId'], right_on = ['FK_resumeId'])
        
del resumeJobs

##############################################################################
# Drop resumes that have a jobstart date that is in the future
##############################################################################

# Drop test entries
resumeWorkExp = resumeWorkExp[resumeWorkExp.jobTitle!="*** INVALID JOB TITLE ***"]

resumeWorkExp['startDate'] = pd.to_datetime(resumeWorkExp.startDate,errors='coerce')

resumeWorkExp['dateGrabData'] = pd.to_datetime(datetime.strptime('2019-08-27', '%Y-%m-%d'))

resumeWorkExp['dropResumeStartDate'] = (resumeWorkExp.startDate >= resumeWorkExp.dateGrabData) * 1

dropStartDate = pd.DataFrame(resumeWorkExp.groupby('FK_resumeId').dropResumeStartDate.max()).reset_index()
dropStartDate = dropStartDate[dropStartDate.dropResumeStartDate==1]
dropStartDate.rename(columns={'dropResumeStartDate':'dropdropStartDate'},inplace=True)

resumeWorkExp = pd.merge(resumeWorkExp, dropStartDate, how='left', left_on = ['FK_resumeId'], right_on = ['FK_resumeId'])

resumeWorkExp = resumeWorkExp[resumeWorkExp.dropdropStartDate!=1]

del resumeWorkExp['dateGrabData']
del resumeWorkExp['dropResumeStartDate']
del resumeWorkExp['dropdropStartDate']
del dropStartDate

print("Finished start date error.")
    
##############################################################################
# Drop resumes that have negative tenure for a job
##############################################################################

resumeWorkExp['endDate'] = pd.to_datetime(resumeWorkExp.endDate,errors='coerce')

resumeWorkExp['yrsTenure'] = (resumeWorkExp.endDate-resumeWorkExp.startDate).apply(lambda x: x.days / 365)

resumeWorkExp['negTenure'] = (resumeWorkExp.yrsTenure < 0) * 1

dropNegTenure = pd.DataFrame(resumeWorkExp.groupby('FK_resumeId').negTenure.max())
dropNegTenure.rename(columns={'negTenure':'dropNegTenure'},inplace=True)
dropNegTenure = dropNegTenure[dropNegTenure.dropNegTenure==1]

resumeWorkExp = pd.merge(resumeWorkExp, dropNegTenure, how='left', left_on = ['FK_resumeId'], right_on = ['FK_resumeId'])

resumeWorkExp = resumeWorkExp[resumeWorkExp.dropNegTenure!=1]

del resumeWorkExp['negTenure']
del resumeWorkExp['dropNegTenure']
del dropNegTenure

print("Finished negative tenure error.")

##############################################################################
# Drop resumes that have a job with impossible tenure period
##############################################################################

resumeWorkExp['tenureTooLong'] = (resumeWorkExp.yrsTenure > 50) * 1

dropTenure = pd.DataFrame(resumeWorkExp.groupby('FK_resumeId').tenureTooLong.max())
dropTenure.rename(columns={'tenureTooLong':'dropTenure'},inplace=True)
dropTenure = dropTenure[dropTenure.dropTenure==1]

resumeWorkExp = pd.merge(resumeWorkExp, dropTenure, how='left', left_on = ['FK_resumeId'], right_on = ['FK_resumeId'])

resumeWorkExp = resumeWorkExp[resumeWorkExp.dropTenure!=1]

del resumeWorkExp['tenureTooLong']
del resumeWorkExp['dropTenure']
del dropTenure
    
print("Finished tenure too long.")

##############################################################################
# Keep only resumes that have job titles for all entries
##############################################################################

resumeWorkExp['missingJobTitle'] = resumeWorkExp.jobTitle.isnull()

missingJobTitle = pd.DataFrame(resumeWorkExp.groupby('FK_resumeId').missingJobTitle.sum()).reset_index()

del resumeWorkExp['missingJobTitle']

missingJobTitle = missingJobTitle[missingJobTitle.missingJobTitle>0]

resumeWorkExp = pd.merge(resumeWorkExp, missingJobTitle, how='left', left_on = ['FK_resumeId'], right_on = ['FK_resumeId'])

resumeWorkExp = resumeWorkExp[resumeWorkExp.missingJobTitle.isnull()]

del resumeWorkExp['missingJobTitle']
del missingJobTitle
    
print("Finished missing job title.")

##############################################################################
# Keep only the most recent resume for each worker
##############################################################################

lastResume = pd.DataFrame(resumeWorkExp.groupby('FK_userId').FK_resumeId.max()).reset_index()

lastResume['lastResume'] = 1

resumeWorkExp = pd.merge(resumeWorkExp, lastResume, how='left', left_on = ['FK_userId','FK_resumeId'], right_on = ['FK_userId','FK_resumeId'])

resumeWorkExp = resumeWorkExp[resumeWorkExp.lastResume==1]

del resumeWorkExp['lastResume']
    
##############################################################################
# Add education information from resumes
##############################################################################

resumeWorkExp = pd.merge(resumeWorkExp, resumeEduc, how='left', left_on = ['FK_userId'], right_on = ['FK_userId'])

#resumeWorkExp = resumeWorkExp[resumeWorkExp.school.notnull()]

##############################################################################
# Append resumes
##############################################################################

allResumes = allResumes.append(resumeWorkExp)

#allResumes = allResumes[allResumes.school.notnull()]

del resumeWorkExp

del allResumes['jobOrder']
del allResumes['resumeJobs']
del allResumes['yrsTenure']
del allResumes['industryName']
del allResumes['yearFounded']

allResumes.to_csv(savePath + "Resume_work_experience.csv")

#allResumes = pd.read_csv(savePath + "Resume_work_experience.csv")

##############################################################################
# For each worker determine if job title has specific keywords
##############################################################################

allResumes['low_job_title'] = allResumes.jobTitle.apply(lambda x: str(x).lower())

allResumes['founder'] = 0
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('founder')>0),'founder'] = 1

# German
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('gründer')>0),'founder'] = 1
# Spanish/Portugeuese
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('fundador')>0),'founder'] = 1
# Italian
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('fondatore')>0),'founder'] = 1
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('fondatrice')>0),'founder'] = 1
# French
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('fondateur')>0),'founder'] = 1
# Polish
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('założyciel')>0),'founder'] = 1
# Turkish
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('kurucu')>0),'founder'] = 1
# Chinese
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('创办人')>0),'founder'] = 1
# Russian
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('основатель')>0),'founder'] = 1
# Indonesian
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('pendiri')>0),'founder'] = 1
# Romanian
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('fondator')>0),'founder'] = 1
# Dutch
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('oprichter')>0),'founder'] = 1
# Czech
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('zakladatel')>0),'founder'] = 1
# Swedish
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('grundare')>0),'founder'] = 1
# Hungarian
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count("alapító")>0),'founder'] = 1

allResumes['csuite'] = 0
allResumes.loc[(allResumes.low_job_title.apply(lambda x : str(x).count('chief')>0)) & (allResumes.low_job_title.apply(lambda x : str(x).count('officer')>0) ),'csuite'] = 1
allResumes.loc[allResumes.jobTitle.apply(lambda x : str(x).count('CEO')>0),'csuite'] = 1
allResumes.loc[allResumes.jobTitle.apply(lambda x : str(x).count('CFO')>0),'csuite'] = 1
allResumes.loc[allResumes.jobTitle.apply(lambda x : str(x).count('CIO')>0),'csuite'] = 1
allResumes.loc[allResumes.jobTitle.apply(lambda x : str(x).count('COO')>0),'csuite'] = 1
allResumes.loc[allResumes.jobTitle.apply(lambda x : str(x).count('CMO')>0),'csuite'] = 1
allResumes.loc[allResumes.jobTitle.apply(lambda x : str(x).count('CTO')>0),'csuite'] = 1

# German
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('geschäftsführer')>0),'csuite'] = 1   
# Spanish
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('director ejecutivo')>0),'csuite'] = 1
# Italian
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('amministratore delegato')>0),'csuite'] = 1
# Portugeuese
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('diretor executivo')>0),'csuite'] = 1
# Polish
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('dyrektor generalny')>0),'csuite'] = 1
# Romanian
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('director executiv')>0),'csuite'] = 1
# Dutch
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('directeur')>0),'csuite'] = 1
# Czech
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('výkonný ředitel')>0),'csuite'] = 1
# Swedish
allResumes.loc[allResumes.low_job_title.apply(lambda x : str(x).count('verkställande direktör')>0),'csuite'] = 1

##############################################################################
# Save worker characteristics
##############################################################################

worker_char = allResumes.groupby('FK_userId')['founder','csuite'].max().reset_index()

worker_char.to_csv(savePath + "User_founder_csuite.csv")

#worker_char = pd.read_csv(savePath + "User_founder_csuite.csv")
#del worker_char['Unnamed: 0']

print("Finished R_create_resumes_work_exp.")
